import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the crime records from the local CSV export and take a first look.
data = pd.read_csv('/Users/danielluo/Downloads/crime-in-vancouver/crime.csv')
data.head()
# Distinct crime categories found in the TYPE column.
data['TYPE'].unique()
# Show only the homicide records.
data.loc[data['TYPE'] == 'Homicide']
Let's create a pivot table of the count of each crime per neighbourhood. Note that for Homicides and Personal Offences, location is not recorded, so they aren't included in the following pivot tables.
# Tag every record with a 1 so pivot tables can count/sum occurrences.
data['counter'] = 1

# Rows: neighbourhoods, columns: crime types, cells: number of records.
neighborhood_crime = pd.pivot_table(
    data,
    values='counter',
    index='NEIGHBOURHOOD',
    columns='TYPE',
    aggfunc='count',
)
neighborhood_crime['All Crime Count'] = neighborhood_crime.sum(axis=1)
neighborhood_crime.sort_values('All Crime Count')[::-1]

# Percentage of each crime type within a neighbourhood. The crime-type
# columns are selected by dropping the total column by name, so the result
# stays correct however many crime types the pivot happens to contain.
crime_type_counts = neighborhood_crime.drop(columns='All Crime Count')
neighborhood_crime_percent = (
    crime_type_counts.div(neighborhood_crime['All Crime Count'], axis=0) * 100
).round(2)
neighborhood_crime_percent['All Crime Count'] = neighborhood_crime['All Crime Count']
neighborhood_crime_percent.sort_values('All Crime Count')[::-1]
Let me guess which areas are sketchy...
Shaughnessy, Arbutus Ridge, Oakridge, and Kerrisdale have a high number of residential break-ins. This indicates to me that either they're just highly residential (and so the only crime that exists is residential break-ins), or that they're just high-crime areas.
Fairview, Strathcona, and Mount Pleasant seem like commercial areas that are relatively suss.
Stanley Park, Musqueam seem like nice areas to live in.
Let's look into whether there's a certain time when more crimes occur.
# Mean HOUR per neighbourhood and crime type, shown alongside the total count.
neighborhood_crime_time = pd.pivot_table(
    data, values='HOUR', index='NEIGHBOURHOOD', columns='TYPE', aggfunc='mean'
)
neighborhood_crime_time['All Crime Count'] = neighborhood_crime['All Crime Count']
neighborhood_crime_time.sort_values('All Crime Count')[::-1]

# Same table using the median HOUR instead of the mean.
neighborhood_crime_time = pd.pivot_table(
    data, values='HOUR', index='NEIGHBOURHOOD', columns='TYPE', aggfunc='median'
)
neighborhood_crime_time['All Crime Count'] = neighborhood_crime['All Crime Count']
neighborhood_crime_time.sort_values('All Crime Count')[::-1]

# Distribution of HOUR for four neighbourhoods, one panel each (2x2 grid,
# filled row by row).
df = data[['HOUR', 'NEIGHBOURHOOD']]
f, axes = plt.subplots(2, 2, figsize=(7, 7), sharex=True)
panels = [
    ("Central Business District", "skyblue"),
    ("West End", "olive"),
    ("Fairview", "gold"),
    ("Mount Pleasant", "teal"),
]
for (hood, colour), panel_ax in zip(panels, axes.flat):
    hours = df.loc[df['NEIGHBOURHOOD'] == hood, 'HOUR']
    sns.distplot(hours, color=colour, ax=panel_ax)
Looks like crime follows a similar path. It peaks at around 17-18 which is like 5-6PM, and there's also another peak at midnight.
Let's try to put crime density on the map. Looks like some of the latitude/longitude data isn't right, or at least is really far from Vancouver, so I cleaned it out.
# Filter out records whose coordinates are zero or fall outside the bounds
# below, printing the remaining shape after each step.
cleaned = data[data['Longitude'] != 0]
print(cleaned.shape)
cleaned = cleaned[cleaned['Longitude'] > -123.797726]
print(cleaned.shape)
# Take an explicit copy: the columns reassigned below would otherwise be set
# on a filtered slice of `data`, which triggers SettingWithCopyWarning.
cleaned = cleaned[cleaned['Latitude'] > 49.118175].copy()
print(cleaned.shape)
cleaned.dtypes

# Replace the text labels with integer category codes so they can be used
# as numeric colour values when plotting.
cleaned['TYPE'] = cleaned['TYPE'].astype('category').cat.codes
cleaned['NEIGHBOURHOOD'] = cleaned['NEIGHBOURHOOD'].astype('category').cat.codes

# Bounding box of the remaining points. Series.min()/max() also cope with an
# empty frame (NaN) where the builtin min()/max() would raise.
print(cleaned['Latitude'].min(), ", ", cleaned['Longitude'].min())
print(cleaned['Latitude'].max(), ", ", cleaned['Longitude'].max())
from matplotlib.pyplot import figure

# Draw the same map twice: a random sample of 1000 cleaned records scattered
# over the background image, coloured first by the TYPE column and then by
# the NEIGHBOURHOOD column. A fresh sample is drawn for each figure.
# NOTE(review): the second positional argument to plt.imread is `format`;
# the 0 passed here is kept as-is -- confirm it is intentional.
for colour_column in ('TYPE', 'NEIGHBOURHOOD'):
    figure(num=None, figsize=(6.13, 2.75), dpi=100, facecolor='w', edgecolor='k')
    img = plt.imread('/Users/danielluo/data/vancouverpic2.png', 0)
    plt.imshow(
        img,
        zorder=0,
        aspect='auto',
        extent=[-123.223955, -122.8445974, 49.20089685, 49.31334872],
    )
    sample = cleaned.sample(1000)
    plt.scatter(sample['Longitude'], sample['Latitude'], c=sample[colour_column], s=5)
    plt.colorbar()
    plt.title("Crime by Location")
    plt.show()
# Distinct NEIGHBOURHOOD values in sorted order, re-indexed from 0.
unique_neighbourhoods = pd.Series(pd.unique(data['NEIGHBOURHOOD']))
neighbourhoods_alpha = unique_neighbourhoods.sort_values().reset_index().iloc[:, 1]
neighbourhoods_alpha
I needed to find the area of each neighbourhood. Unfortunately I ended up just finding them one by one, so it was a bit challenging, but it wasn't that bad. Courtesy of the Vancouver website.
# Area of each neighbourhood, entered by hand. The values are paired by
# position with `neighbourhoods_alpha` when the two are concatenated later.
# NOTE(review): units are not stated here -- presumably hectares; confirm.
neighbourhood_areas = pd.Series([
    370,
    370,
    856,
    327,
    445,
    793,
    724,
    631,
    664,
    546,
    559,
    366,
    125,
    401,
    805,
    491,
    446,
    217,
    405,
    388,
    626,
    531,
    198,
    445,
    0,  # placeholder for the final entry, which has no known area
])
# Series.replace returns a new Series, so the result must be assigned back;
# np.nan is the spelling that exists in every NumPy version.
neighbourhood_areas = neighbourhood_areas.replace(0, np.nan)
neighbourhood_areas
# Pair each neighbourhood name with its area (matched by position) and index
# the result by neighbourhood.
area_neighbourhood = pd.concat(
    [neighbourhoods_alpha, neighbourhood_areas],
    axis=1,
    keys=['NEIGHBOURHOOD', 'AREA'],
).set_index('NEIGHBOURHOOD')
area_neighbourhood

# Attach the area to the per-neighbourhood crime counts and derive the
# number of crimes per unit of area.
neighbourhood_crime_area = neighborhood_crime.join(area_neighbourhood)
neighbourhood_crime_area['density'] = neighbourhood_crime_area['All Crime Count'].div(
    neighbourhood_crime_area['AREA']
)
neighbourhood_crime_area.sort_values('density')[::-1]
The densest crime is in the Central Business District and West End by a large margin. These are likely high-density areas with a lot of people in a small area. Maybe we should look at it in terms of population too. That's a lot of work, though.
# Number of records per year for three neighbourhoods, drawn on one chart.
# The `counter` column is selected by name rather than by position, so the
# result no longer depends on the column order of the CSV.
# NOTE(review): the positional index used previously (11) appears to have
# pointed at `counter` for the usual 12-column file -- confirm.

# Central Business District
downtown = (
    data[data['NEIGHBOURHOOD'] == 'Central Business District']
    .groupby('YEAR')['counter']
    .count()
)
strathcona = data[data['NEIGHBOURHOOD'] == 'Strathcona'].groupby('YEAR')['counter'].count()
mt_pleasant = data[data['NEIGHBOURHOOD'] == 'Mount Pleasant'].groupby('YEAR')['counter'].count()

ax = downtown.plot(label='Downtown')
strathcona.plot(label="Strathcona", ax=ax)
mt_pleasant.plot(label='Mt Pleasant', ax=ax)
plt.legend()
plt.title('Number of Crimes from 2003 to 2017')
plt.show()
Why is there such a large rise in crime in 2016?
Let's see if this trend is true for the rest of the neighbourhoods. I'm thinking maybe there was a rise in population?
# Yearly record count for every neighbourhood, one line per neighbourhood.
neighbourhood_time_crime = pd.pivot_table(
    data, values='counter', index='YEAR', columns='NEIGHBOURHOOD', aggfunc='sum'
)
ax = neighbourhood_time_crime.plot(title='Crime Count by Years')
ax.get_legend().set_bbox_to_anchor((1, 1))  # move the legend out of the plot area

area_neighbourhood
# Remove the entry whose neighbourhood label is missing. Filtering on the
# label (instead of dropping "the last row") makes this safe to re-run: a
# second execution removes nothing further.
area_neighbourhood = area_neighbourhood[area_neighbourhood.index.notna()]
area_neighbourhood

# Yearly counts divided by neighbourhood area, transposed so that years run
# along the x-axis when plotted.
time_density_neighbourhood = pd.pivot_table(
    data, values='counter', index='NEIGHBOURHOOD', columns='YEAR', aggfunc='sum'
).divide(area_neighbourhood['AREA'], axis=0).T
ax = time_density_neighbourhood.plot(title="Crime Density through the Years")
ax.get_legend().set_bbox_to_anchor((1, 1))
data[data['NEIGHBOURHOOD'] == "Central Business District"]
There is a news article that blames the increase of immigration from Alberta. Is this lowkey racism?
https://www.vancourier.com/news/vpd-sees-10-year-spike-in-break-ins-to-cars-businesses-1.2370931
# List the distinct crime categories again before grouping them.
data['TYPE'].unique()
Break the data into serious crimes and non-serious crimes.
data
'Break and Enter Residential/Other', 'Mischief', 'Break and Enter Commercial'
# Crime types treated as "serious" for this comparison.
serious_crimes = [
    'Other Theft',
    'Break and Enter Residential/Other',
    'Mischief',
    'Break and Enter Commercial',
]
serious_crime_data = data.loc[data['TYPE'].isin(serious_crimes)]

# Yearly counts of those crimes per neighbourhood, divided by area and
# transposed so that years run along the x-axis.
serious_counts = pd.pivot_table(
    serious_crime_data,
    values='counter',
    index='NEIGHBOURHOOD',
    columns='YEAR',
    aggfunc='sum',
)
time_density_neighbourhood = serious_counts.divide(area_neighbourhood['AREA'], axis=0).T
ax = time_density_neighbourhood.plot(title="Crime Density through the Years")
ax.get_legend().set_bbox_to_anchor((1, 1))
def plot_density(data, crime_type):
    """Plot yearly crime density per neighbourhood for one crime type.

    Keeps the rows of ``data`` whose TYPE equals ``crime_type``, sums their
    ``counter`` values per neighbourhood and YEAR, divides each
    neighbourhood's row by its AREA from the module-level
    ``area_neighbourhood`` frame, and draws one line per neighbourhood.

    Args:
        data: Crime records with TYPE, NEIGHBOURHOOD, YEAR and counter columns.
        crime_type: Value of the TYPE column to plot.
    """
    records = data[data['TYPE'] == crime_type]
    yearly_counts = pd.pivot_table(
        records, values='counter', index='NEIGHBOURHOOD', columns='YEAR', aggfunc='sum'
    )
    # Transpose so that years run along the x-axis.
    density_by_year = yearly_counts.divide(area_neighbourhood['AREA'], axis=0).T
    axis = density_by_year.plot(title="Crime Density through the Years: " + str(crime_type))
    axis.get_legend().set_bbox_to_anchor((1, 1))
plot_density(data, 'Other Theft')
pd.unique(data['TYPE'])  # you don't want Homicide or Offence Against a Person
crimes = pd.unique(data['TYPE'])
crimes
# Keep every crime type except the two named above.
is_known = (crimes != 'Homicide') & (crimes != 'Offence Against a Person')
known_crimes = crimes[is_known]
known_crimes  # there are 9 crimes
# One density-by-year chart per remaining crime type.
for crime in known_crimes:
    plot_density(data, crime)
The amount of Break and Enter Residential and Vehicle Theft have all gone down consistently.
Theft of Bike, Other Theft, Break and Enter Commercial are rising.
There has ultimately been a rise in crime recently, peaking in 2016. This is quite strange.
Above was by year; now I'll do it by month. Looking at the trends in crimes by month, day, hour, etc.
def plot_density_month(data, crime_type):
    """Plot monthly crime density per neighbourhood for one crime type.

    Keeps the rows of ``data`` whose TYPE equals ``crime_type``, sums their
    ``counter`` values per neighbourhood and MONTH, divides each
    neighbourhood's row by its AREA from the module-level
    ``area_neighbourhood`` frame, and draws one line per neighbourhood.

    Args:
        data: Crime records with TYPE, NEIGHBOURHOOD, MONTH and counter columns.
        crime_type: Value of the TYPE column to plot.
    """
    subset = data[data['TYPE'] == crime_type]
    monthly_counts = pd.pivot_table(
        subset, values='counter', index='NEIGHBOURHOOD', columns='MONTH', aggfunc='sum'
    )
    # Transpose so that months run along the x-axis.
    time_density_neighbourhood = monthly_counts.divide(area_neighbourhood['AREA'], axis=0).T
    # The x-axis is the month, so the title says so.
    ax = time_density_neighbourhood.plot(title="Crime Density by Month: " + str(crime_type))
    ax.get_legend().set_bbox_to_anchor((1, 1))


# One density-by-month chart per crime type.
for crime in known_crimes:
    plot_density_month(data, crime)
Most of the crimes are consistent throughout the year, except bike theft. Bike theft rises in the summer months which makes sense. People bike more and desire bikes more in the summer.
def plot_density(data, crime_type, time_interval='YEAR'):
    """Plot crime density per neighbourhood for one crime type over time.

    Keeps the rows of ``data`` whose TYPE equals ``crime_type``, sums their
    ``counter`` values per neighbourhood and ``time_interval`` bucket,
    divides each neighbourhood's row by its AREA from the module-level
    ``area_neighbourhood`` frame, and draws one line per neighbourhood.

    Args:
        data: Crime records with TYPE, NEIGHBOURHOOD and counter columns plus
            the column named by ``time_interval``.
        crime_type: Value of the TYPE column to plot.
        time_interval: Name of the column to bucket by (e.g. 'YEAR', 'MONTH',
            'DAY', 'HOUR'). Defaults to 'YEAR' so that two-argument calls
            written for the earlier definition of this function keep working.
    """
    subset = data[data['TYPE'] == crime_type]
    counts = pd.pivot_table(
        subset, values='counter', index='NEIGHBOURHOOD', columns=time_interval, aggfunc='sum'
    )
    # Transpose so that the time buckets run along the x-axis.
    time_density_neighbourhood = counts.divide(area_neighbourhood['AREA'], axis=0).T
    # Name the actual bucket in the title instead of always saying "Years".
    title = "Crime Density by " + str(time_interval).title() + ": " + str(crime_type)
    ax = time_density_neighbourhood.plot(title=title)
    ax.get_legend().set_bbox_to_anchor((1, 1))


# One density-by-day-of-month chart per crime type.
for crime in known_crimes:
    plot_density(data, crime, 'DAY')
Fairly constant. There's a spike in Break and Enter Commercial and Mischief on the 15th, which is strange. It's the Stanley Cup Riot in 2011 lol. It was on June 15th.
# One density-by-hour chart per crime type.
for crime in known_crimes:
    plot_density(data, crime, time_interval='HOUR')
People steal things during the day, between 10:00 and 20:00. Mischief goes through the night. Car Jacking occurs at night.
I think something important to note is that this is when reports occur, not necessarily when the actual crime occurred.